{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 09 scikit-learn中的回归问题"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn import datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "boston = datasets.load_boston()\n",
    "\n",
    "X = boston.data\n",
    "y = boston.target\n",
    "\n",
    "X = X[y < 50.0]\n",
    "y = y[y < 50.0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "(490, 13)"
     },
     "metadata": {},
     "execution_count": 17
    }
   ],
   "source": [
    "X.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from playML.model_selection import train_test_split\n",
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### scikit-learn中的线性回归"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)"
     },
     "metadata": {},
     "execution_count": 19
    }
   ],
   "source": [
    "from sklearn.linear_model import LinearRegression\n",
    "\n",
    "lin_reg = LinearRegression()\n",
    "lin_reg.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "array([-1.20354261e-01,  3.64423279e-02, -3.61493155e-02,  5.12978140e-02,\n       -1.15775825e+01,  3.42740062e+00, -2.32311760e-02, -1.19487594e+00,\n        2.60101728e-01, -1.40219119e-02, -8.35430488e-01,  7.80472852e-03,\n       -3.80923751e-01])"
     },
     "metadata": {},
     "execution_count": 20
    }
   ],
   "source": [
    "lin_reg.coef_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "34.117399723229845"
     },
     "metadata": {},
     "execution_count": 21
    }
   ],
   "source": [
    "lin_reg.intercept_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.8129794056212809"
     },
     "metadata": {},
     "execution_count": 22
    }
   ],
   "source": [
    "lin_reg.score(X_test, y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### kNN Regressor\n",
    "Sklearn中使用kNN进行回归的方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "standardScaler = StandardScaler()\n",
    "standardScaler.fit(X_train, y_train)\n",
    "X_train_standard = standardScaler.transform(X_train)\n",
    "X_test_standard = standardScaler.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.847923904906593"
     },
     "metadata": {},
     "execution_count": 24
    }
   ],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor\n",
    "\n",
    "knn_reg = KNeighborsRegressor()\n",
    "knn_reg.fit(X_train_standard, y_train)\n",
    "knn_reg.score(X_test_standard, y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "Fitting 5 folds for each of 60 candidates, totalling 300 fits\n[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.\n[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.1s\n[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    0.4s finished\n"
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "GridSearchCV(cv=None, error_score=nan,\n             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,\n                                           metric='minkowski',\n                                           metric_params=None, n_jobs=None,\n                                           n_neighbors=5, p=2,\n                                           weights='uniform'),\n             iid='deprecated', n_jobs=-1,\n             param_grid=[{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n                          'weights': ['uniform']},\n                         {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n                          'p': [1, 2, 3, 4, 5], 'weights': ['distance']}],\n             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n             scoring=None, verbose=1)"
     },
     "metadata": {},
     "execution_count": 25
    }
   ],
   "source": [
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "param_grid = [\n",
    "    {\n",
    "        \"weights\": [\"uniform\"],\n",
    "        \"n_neighbors\": [i for i in range(1, 11)]\n",
    "    },\n",
    "    {\n",
    "        \"weights\": [\"distance\"],\n",
    "        \"n_neighbors\": [i for i in range(1, 11)],\n",
    "        \"p\": [i for i in range(1,6)]\n",
    "    }\n",
    "]\n",
    "\n",
    "knn_reg = KNeighborsRegressor()\n",
    "grid_search = GridSearchCV(knn_reg, param_grid, n_jobs=-1, verbose=1)\n",
    "grid_search.fit(X_train_standard, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "{'n_neighbors': 7, 'p': 1, 'weights': 'distance'}"
     },
     "metadata": {},
     "execution_count": 26
    }
   ],
   "source": [
    "grid_search.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.8121986929882669"
     },
     "metadata": {},
     "execution_count": 27
    }
   ],
   "source": [
    "# CV方法计算的分数\n",
    "grid_search.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "0.8703184399069476"
     },
     "metadata": {},
     "execution_count": 28
    }
   ],
   "source": [
    "# 网格搜索后计算的训练集计算的R2\n",
    "grid_search.best_estimator_.score(X_test_standard, y_test)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6.8 64-bit ('anaconda': conda)",
   "language": "python",
   "name": "python36864bitanacondaconda085db2e8b14649f4b32196068f7124e9"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7-final"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}